PilotD2Redo - QC and exploratory analysis¶

In [1]:
%load_ext autoreload
%autoreload 2
In [2]:
import os
import yaml
import scanpy as sc
import rapids_singlecell as rsc
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['figure.autolayout'] = True

# Increase all font sizes
plt.rcParams['font.size'] = 16  # Base font size
plt.rcParams['axes.titlesize'] = 18
plt.rcParams['axes.labelsize'] = 16
plt.rcParams['xtick.labelsize'] = 15
plt.rcParams['ytick.labelsize'] = 15
plt.rcParams['legend.fontsize'] = 15

from preprocess import _convert_oak_path
import qc_plots
/home/emmadann/.local/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm

Configuration

In [3]:
# Read config
experiment_name = 'PilotD2Redo_Lane2'
config_file = '../../metadata/experiments_config.yaml'
with open(config_file, 'r') as f:
    config = yaml.safe_load(f)

config = config[experiment_name]
datadir = _convert_oak_path(config['datadir'])
sample_metadata_csv = _convert_oak_path(config['sample_metadata'])

PLOTDIR = f'../../results/{experiment_name}/'
sc.settings.figdir = PLOTDIR

def save_plot(pl_name, plot_dir = None):
    if plot_dir is None:
        plot_dir = PLOTDIR
    plt.savefig(f'{plot_dir}/{pl_name}.pdf');
    plt.savefig(f'{plot_dir}/{pl_name}.png');

Read merged dataset¶

In [4]:
sample_metadata = pd.read_csv(sample_metadata_csv, index_col=0)
sgrna_library_metadata = pd.read_csv('../../metadata/sgRNA_library_curated.csv', index_col=0)
sample_metadata
Out[4]:
experiment_id cell_sample_id donor_id culture_condition library_id library_prep_kit probe_hyb_loading GEM_loading
0 PilotD2Redo_Lane2 PilotD2Redo_Day7 CE0010216 Day7 PilotD2Redo_Lane2_Day7 GEMX_flex_v2 1M cells, 10 uL probe 3x
1 PilotD2Redo_Lane2 PilotD2Redo_Rest CE0010216 Rest PilotD2Redo_Lane2_Rest GEMX_flex_v2 1M cells, 10 uL probe 3x
2 PilotD2Redo_Lane2 PilotD2Redo_Restim6hr CE0010216 Restim6hr PilotD2Redo_Lane2_Restim6hr GEMX_flex_v2 1M cells, 10 uL probe 3x
3 PilotD2Redo_Lane2 PilotD2Redo_Restim24hr CE0010216 Restim24hr PilotD2Redo_Lane2_Restim24hr GEMX_flex_v2 1M cells, 10 uL probe 3x
In [43]:
adata = sc.read_h5ad(f'{datadir}/{experiment_name}_merged.gex.lognorm.h5ad', backed=False)
adata.var = adata.var[['gene_ids', 'gene_name', 'mt']].copy()
adata.var_names = adata.var[ 'gene_name'].values
In [44]:
adata
Out[44]:
AnnData object with n_obs × n_vars = 668085 × 18129
    obs: 'cell_sample_id', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'n_genes', 'batch', 'Unnamed: 0', 'experiment_id', 'donor_id', 'culture_condition', 'library_id', 'library_prep_kit', 'probe_hyb_loading', 'GEM_loading'
    var: 'gene_ids', 'gene_name', 'mt'
    uns: 'hvg', 'log1p', 'pca'
    obsm: 'X_pca'
    varm: 'PCs'
In [45]:
if 'guide_id' not in adata.obs:
    # Load assignment of sgRNAs to cells
    sgrna_assignment = pd.read_csv(f'{datadir}/sgrna_assignment.csv', index_col=0)
    sgrna_assignment_index = sgrna_assignment.index.tolist()
    sgrna_assignment = pd.merge(sgrna_assignment, sgrna_library_metadata.rename({"sgrna_id":'guide_id'}, axis=1), how='left')
    sgrna_assignment.index = sgrna_assignment_index

    existing_cols = [col for col in sgrna_assignment.columns if col in adata.obs.columns]
    adata.obs.drop(columns=existing_cols, inplace=True)
    adata.obs = pd.concat([adata.obs, sgrna_assignment.loc[adata.obs_names]], axis=1)
    adata.obs['guide_type'] = np.where(adata.obs['guide_id'].str.startswith('NTC-'), 'non-targeting', 'targeting')

QC metrics¶

In [7]:
qc_plots.plot_ncells_sample(adata);
save_plot(f'{experiment_name}_n_cells_preQC')
No description has been provided for this image
In [73]:
sc.pl.violin(adata, 'log1p_total_counts', groupby='cell_sample_id', rotation=90)
No description has been provided for this image
In [71]:
fig, axs = plt.subplots(4,1, figsize=(5,10))
for i,m in enumerate(['log1p_total_counts','total_counts', 'n_genes', 'pct_counts_mt']):
    sc.pl.violin(adata, m, groupby='cell_sample_id', rotation=90, show=False, ax=axs[i]);
    if i != 3:
        # remove x-axis tick labels
        axs[i].set_xticklabels([])

fig.tight_layout()
fig.show()
No description has been provided for this image
In [69]:
sc.pl.scatter(adata, 'log1p_total_counts', 'log1p_n_genes_by_counts', color='culture_condition')
sc.pl.scatter(adata, 'log1p_n_genes_by_counts', 'pct_counts_in_top_100_genes', color='culture_condition', size=3)
No description has been provided for this image
No description has been provided for this image
In [13]:
adata.obs[['total_counts', 'n_genes', 'pct_counts_mt', 'cell_sample_id']].groupby('cell_sample_id').mean()
Out[13]:
total_counts n_genes pct_counts_mt
cell_sample_id
PilotD2Redo_Day7 14184.199219 4703.165669 0.612662
PilotD2Redo_Rest 7462.265137 3313.420798 0.550297
PilotD2Redo_Restim6hr 7886.190430 3294.112534 0.364597
PilotD2Redo_Restim24hr 7778.331055 3401.161530 0.824780

Estimate fraction of low quality cells (high fraction of mitochondrial genes, low number of captured genes)

In [51]:
adata.obs['low_quality'] = (adata.obs['pct_counts_mt'] > 5) | (adata.obs['n_genes'] < 1000)
In [52]:
# plot fraction of low_quality cells for each sample_id
low_quality_df = adata.obs.groupby(['cell_sample_id', 'culture_condition'])['low_quality'].mean().reset_index(name='fraction_low_quality')
low_quality_df
sns.barplot(data=low_quality_df, x='cell_sample_id', y='fraction_low_quality', hue='culture_condition', dodge=False);
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.);
plt.xticks(rotation=90);
No description has been provided for this image

sgRNA assignment QC metrics¶

In [74]:
import upsetplot
from upsetplot import from_contents
In [75]:
all_crispr_a = {}
for sample_id in sample_metadata['cell_sample_id']:
    sgrna_h5ad = f"{datadir}{sample_id}.sgRNA.h5ad"
    crispr_a = sc.read_h5ad(sgrna_h5ad)
    all_crispr_a[sample_id] = crispr_a
In [30]:
# Get lists of inefficient and nonspecific guides for each sample
all_inefficient = {k:x.var_names[x.var['inefficient']].tolist() for k,x in all_crispr_a.items()}
all_nonspecific = {k:x.var_names[x.var['nonspecific']].tolist() for k,x in all_crispr_a.items()}

inef = from_contents(all_inefficient)
ax_dict = upsetplot.UpSet(inef, subset_size="count").plot()

nons = from_contents(all_nonspecific)
ax_dict = upsetplot.UpSet(nons).plot()
No description has been provided for this image
No description has been provided for this image
In [82]:
pl_df = adata.obs[['guide_id', 'cell_sample_id', 'low_quality']]
pl_df['group'] = 'targeting single sgRNA'
pl_df['group'] = np.where(adata.obs['guide_id'].str.startswith('NTC'), 'NTC single sgRNA', pl_df['group'])
pl_df['group'] = np.where(adata.obs['guide_id'].isna(), 'no sgRNA (>= 3 UMIs)', pl_df['group'])
pl_df['group'] = np.where(adata.obs['guide_id']  == 'multi_sgRNA', 'multi sgRNA (>= 3 UMIs)', pl_df['group'])
pl_df = pl_df[~pl_df['low_quality']].copy()
# Plot stacked barplot of fraction of cells in each group for each sample

group_counts = pl_df.groupby(['cell_sample_id', 'group']).size().unstack()
group_fractions = group_counts.div(group_counts.sum(axis=1), axis=0)
group_fractions.plot(kind='bar', stacked=True, figsize=(4,5))
plt.xticks(rotation=45, ha='right')
plt.ylabel('Fraction of cells')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.title("3x loading");
plt.tight_layout()
No description has been provided for this image
In [83]:
group_fractions
Out[83]:
group NTC single sgRNA multi sgRNA (>= 3 UMIs) no sgRNA (>= 3 UMIs) targeting single sgRNA
cell_sample_id
PilotD2Redo_Day7 0.018776 0.344840 0.134183 0.502200
PilotD2Redo_Rest 0.020919 0.284717 0.181568 0.512796
PilotD2Redo_Restim6hr 0.020682 0.304761 0.176436 0.498120
PilotD2Redo_Restim24hr 0.021999 0.268535 0.195860 0.513607
In [51]:
# no_guide_cells = adata.obs_names[adata.obs['guide_id'].isna()]
# multi_guide_cells = adata.obs_names[adata.obs['guide_id'] == 'multi_sgRNA']

# fig, axes = plt.subplots(1, 4, figsize=(20, 5))
# for i, (sample_id, crispr_a) in enumerate(all_crispr_a.items()):
#     crispr_a.obs['multi_sgrna'] = crispr_a.obs_names.isin(multi_guide_cells)
#     crispr_a.obs['no_sgrna'] = crispr_a.obs_names.isin(no_guide_cells)
#     crispr_a.obs['targeting_umis'] = np.array(crispr_a[:, crispr_a.var['sgrna_type'] == 'targeting'].X.sum(axis=1)).flatten()
#     crispr_a.obs['ntc_umis'] = np.array(crispr_a[:, crispr_a.var['sgrna_type'] == 'NTC'].X.sum(axis=1)).flatten()
#     crispr_a.obs['inefficient_umis'] = np.array(crispr_a[:, crispr_a.var['inefficient']].X.sum(axis=1)).flatten()
#     crispr_a.obs['nonspecific_umis'] = np.array(crispr_a[:, crispr_a.var['nonspecific']].X.sum(axis=1)).flatten()

#     ntc_n = sum(no_sgrna_a.obs['ntc_umis'] > 3)
#     tot_n = no_sgrna_a.n_obs
#     subtitle = f'{ntc_n}/{tot_n} multi sgRNA cells with NTC sgRNAs'
#     no_sgrna_a = crispr_a[crispr_a.obs['multi_sgrna']].copy()
    
#     axes[i].scatter(no_sgrna_a.obs['ntc_umis'], no_sgrna_a.obs['targeting_umis'], s=3)
#     axes[i].set_xscale('log')
#     axes[i].set_yscale('log')
#     axes[i].set_xlabel('NTC sgRNA UMI counts')
#     axes[i].set_ylabel('Targeting sgRNA UMI counts')
#     axes[i].set_title('Multi sgRNA cells - ' + crispr_a.obs['cell_sample_id'][0] + '\n' + subtitle)

# plt.tight_layout()
# plt.show()

Dimensionality reduction¶

In [16]:
sc.pl.pca(adata, annotate_var_explained=True, color=['culture_condition'], components=['1,2'], wspace=0.5, ncols=3, size=2)
No description has been provided for this image
In [17]:
sc.pl.pca(adata, annotate_var_explained=True, color=['culture_condition', 'pct_counts_mt', 'log1p_n_genes_by_counts'], components=['1,2', '3,4', '5,6'], wspace=0.5, ncols=3, size=3)
No description has been provided for this image

Visualize genes with top loadings for each PC

In [18]:
sc.pl.pca_loadings(adata, components='1,2,3');
sc.pl.pca_loadings(adata, components='4,5,6')
No description has been provided for this image
No description has been provided for this image

Clustering¶

In [46]:
rsc.pp.neighbors(adata, n_neighbors=50)
rsc.tl.umap(adata)
rsc.tl.louvain(adata)
In [47]:
# adata.write_h5ad()
In [64]:
adata.obs.columns
Out[64]:
Index(['cell_sample_id', 'n_genes_by_counts', 'log1p_n_genes_by_counts',
       'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes',
       'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes',
       'pct_counts_in_top_500_genes', 'total_counts_mt',
       'log1p_total_counts_mt', 'pct_counts_mt', 'n_genes', 'batch',
       'Unnamed: 0', 'experiment_id', 'donor_id', 'culture_condition',
       'library_id', 'library_prep_kit', 'probe_hyb_loading', 'GEM_loading',
       'guide_id', 'top_guide_umi_counts', 'sequence', 'perturbed_gene_name',
       'perturbed_gene_id', 'guide_type', 'louvain', 'low_quality'],
      dtype='object')
In [65]:
sc.pl.umap(adata, color=['cell_sample_id'] + ['log1p_total_counts','log1p_n_genes_by_counts','pct_counts_mt', 'pct_counts_in_top_100_genes'], wspace=0.4, size=3, ncols=2)
No description has been provided for this image
In [53]:
sc.pl.umap(adata, color=['cell_sample_id'] + ['low_quality'], wspace=0.4, size=3)
No description has been provided for this image
In [50]:
qc_plots.plot_markers_sample_dotplot(adata, save=f'{experiment_name}_markers_sample_dotplot')
WARNING: saving figure to file ../../results/PilotD2Redo_Lane2/dotplot_PilotD2Redo_Lane2_markers_sample_dotplot.pdf
No description has been provided for this image
In [54]:
qc_plots.plot_markers_umap(adata, wspace=0.1, size=3, cmap='magma', ncols=5, save=f'{experiment_name}_markers.png')
WARNING: saving figure to file ../../results/PilotD2Redo_Lane2/umapPilotD2Redo_Lane2_markers.png
No description has been provided for this image
In [55]:
sc.pl.umap(adata, color='louvain', legend_loc='on data')
No description has been provided for this image
In [56]:
sc.pl.violin(adata, ['log1p_total_counts', 'pct_counts_mt'], groupby='louvain')
No description has been provided for this image
In [59]:
# plot fraction of low_quality cells for each cluster
low_quality_df = adata.obs.groupby(['cell_sample_id', 'louvain'])['low_quality'].mean().reset_index(name='fraction_low_quality')

sns.barplot(data=low_quality_df, x='louvain', y='fraction_low_quality', hue='louvain', dodge=False);
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.);
plt.xticks(rotation=90);
No description has been provided for this image
In [58]:
group_by = 'culture_condition'
markers_dict = qc_plots.load_Tcell_markers(adata)
order_samples = adata.obs.sort_values([group_by])['louvain'].astype(str).unique()
adata.obs['louvain'] = adata.obs['louvain'].cat.reorder_categories(order_samples)
sc.pl.dotplot(adata, markers_dict, groupby='louvain')
No description has been provided for this image
In [62]:
low_quality_clusters = ['3', '13']
sc.pl.umap(adata, color='louvain', legend_loc='on data', size=5, groups=low_quality_clusters)
sc.pl.umap(adata, color=['low_quality'], wspace=0.4, size=3, groups=[True])
No description has been provided for this image
No description has been provided for this image
In [67]:
adata_postqc = adata[~adata.obs['louvain'].isin(low_quality_clusters)].copy()
In [68]:
adata_postqc.write_h5ad(f'{datadir}/{experiment_name}_merged.gex.lognorm.postQC.h5ad')

Knock-down efficiency¶

In [84]:
sc.pl.umap(adata, color='guide_type')
No description has been provided for this image
In [85]:
guide_cell_counts = adata.obs['guide_id'].value_counts()
guide_cell_counts.index = guide_cell_counts.index.astype(str)
plt.subplot(1,2,1);
plt.plot(guide_cell_counts[1:].values);
plt.xlabel('sgRNA rank');
plt.ylabel('N cells with unique sgRNA');


gene_cell_counts = adata.obs['perturbed_gene_name'].value_counts()
gene_cell_counts.index = gene_cell_counts.index.astype(str)

plt.subplot(1,2,2);
plt.plot(gene_cell_counts[1:].values);
plt.xlabel('perturbed gene rank');
plt.ylabel('N cells with unique perturbed gene');
No description has been provided for this image
In [86]:
perturbed_gene_expr_df = qc_plots.calculate_perturbed_gene_expression(adata)
kd_results_c = qc_plots.test_knockdown_simple(perturbed_gene_expr_df)
In [87]:
kd_results_c.to_csv(f'{datadir}/knockdown_efficacy_simple.csv')
In [98]:
kd_results_c['signif_knockdown'].shape[0]
Out[98]:
12764
In [97]:
kd_results_c['signif_knockdown'].value_counts()
Out[97]:
signif_knockdown
True     8886
False    3878
Name: count, dtype: int64
In [ ]:
# # Plot cumulative distributions
# plt.plot(np.sort(perturbed_gene_expr_df['perturbed_gene_expr']),
#          np.linspace(0, 1, len(perturbed_gene_expr_df['perturbed_gene_expr'])), label='Targeting')
# plt.plot(np.sort(perturbed_gene_expr_df['perturbed_gene_mean_ntc']),
#          np.linspace(0, 1, len(perturbed_gene_expr_df['perturbed_gene_mean_ntc'])), label='NTC mean')
# plt.xlabel('Expression of perturbed gene');
# plt.ylabel('Cumulative fraction');
# plt.legend()
In [100]:
mean_perturbed_gene_expr_df = kd_results_c
order_genes = mean_perturbed_gene_expr_df.sort_values('perturbed_gene_mean_ntc', ascending=False).perturbed_gene.astype(str).values
mean_perturbed_gene_expr_df['perturbed_gene'] = pd.Categorical(mean_perturbed_gene_expr_df['perturbed_gene'], categories=order_genes, ordered=True)

pl_df = mean_perturbed_gene_expr_df[mean_perturbed_gene_expr_df['perturbed_gene'].isin(order_genes[0:1000])].sort_values('perturbed_gene')
plt.figure(figsize=(20,6))
# Plot mean NTC expression with error bars
plt.errorbar(pl_df['perturbed_gene'],
             pl_df['perturbed_gene_mean_ntc'], 
             yerr=pl_df['perturbed_gene_std_ntc'],
             fmt='o',
             color='grey',
             markersize=2,
             alpha=0.5,
             label='NT control cells')

# Add stripplot of perturbed expression
plt.errorbar(pl_df[pl_df.signif_knockdown]['perturbed_gene'],
           pl_df[pl_df.signif_knockdown]['perturbed_gene_expr'],
           yerr=pl_df[pl_df.signif_knockdown]['perturbed_gene_expr_std'],
           fmt='o',
           markersize=2,
           color='red',
           alpha=0.5,
           label='Perturbed cells (significant KD at 10% FDR)')
plt.errorbar(pl_df[~pl_df.signif_knockdown]['perturbed_gene'],
           pl_df[~pl_df.signif_knockdown]['perturbed_gene_expr'],
           yerr=pl_df[~pl_df.signif_knockdown]['perturbed_gene_expr_std'],
           fmt='o',
           markersize=2,
           color='blue',
           alpha=0.5,
           label='Perturbed cells (non significant KD)')

plt.xticks([])
plt.ylim(0)
plt.xlabel('Perturbed gene')
plt.ylabel('Perturbed gene expression (log-normalized counts)')
plt.legend(frameon=False);
plt.tight_layout()
No description has been provided for this image
In [101]:
plt.figure(figsize=(20,6))
mean_perturbed_gene_expr_df['rank'] = mean_perturbed_gene_expr_df['perturbed_gene_mean_ntc'].rank()
# # Plot mean NTC expression with error bars
plt.errorbar(mean_perturbed_gene_expr_df['rank'],
             mean_perturbed_gene_expr_df['perturbed_gene_mean_ntc'], 
             yerr=mean_perturbed_gene_expr_df['perturbed_gene_std_ntc'],
             fmt='o',
             color='grey',
             markersize=1, 
             alpha=0.1,
             label='NT control cells')
# # Add stripplot of perturbed expression
# plt.errorbar(mean_perturbed_gene_expr_df['rank'],
#            mean_perturbed_gene_expr_df['perturbed_gene_expr'],
#            yerr=mean_perturbed_gene_expr_df['perturbed_gene_expr_std'],
#            fmt='o',
#              markersize=1, 
#              alpha=0.1,
#            color='red',
#            label='Perturbed cells (significant KD at 10% FDR)')
# # plt.errorbar(mean_perturbed_gene_expr_df[~mean_perturbed_gene_expr_df.signif_knockdown]['rank'],
# #            mean_perturbed_gene_expr_df[~mean_perturbed_gene_expr_df.signif_knockdown]['perturbed_gene_expr'],
# #            yerr=mean_perturbed_gene_expr_df[~mean_perturbed_gene_expr_df.signif_knockdown]['perturbed_gene_expr_std'],
# #            fmt='o',
# #             markersize=1, 
# #              alpha=0.1,
# #            color='blue',
# #            label='Perturbed cells (non significant KD)')

# sns.scatterplot(y=mean_perturbed_gene_expr_df['perturbed_gene_mean_ntc'], x=mean_perturbed_gene_expr_df['perturbed_gene_mean_ntc'].rank(), s=1, color='grey', label='NTC cells', linewidth=0);
sns.scatterplot(y=mean_perturbed_gene_expr_df['perturbed_gene_expr'], x=mean_perturbed_gene_expr_df['perturbed_gene_mean_ntc'].rank(), s=3, label='Targeting cells', color='red', linewidth=0);
plt.ylim(0)
plt.xlabel('Perturbed gene')
plt.ylabel('Perturbed gene expression (log-normalized counts)')
Out[101]:
Text(0, 0.5, 'Perturbed gene expression (log-normalized counts)')
No description has been provided for this image
In [102]:
all_conditions = adata.obs['culture_condition'].unique()
all_conditions_tests = {}
for c in all_conditions:
    print(c)
    ad_c = adata[adata.obs['culture_condition'] == c].copy()
    perturbed_gene_expr_df = qc_plots.calculate_perturbed_gene_expression(ad_c)
    kd_results_c = qc_plots.test_knockdown_simple(perturbed_gene_expr_df)
    kd_results_c['culture_condition'] = c
    all_conditions_tests[c] = kd_results_c
Day7
Rest
Restim6hr
Restim24hr
In [103]:
for k in all_conditions_tests.keys():
    print(all_conditions_tests[k].signif_knockdown.value_counts())
signif_knockdown
True     7297
False    5340
Name: count, dtype: int64
signif_knockdown
True     7427
False    5058
Name: count, dtype: int64
signif_knockdown
True     7284
False    5182
Name: count, dtype: int64
signif_knockdown
True     7463
False    4896
Name: count, dtype: int64